library(pacman)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(lubridate)
library(gmodels)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(dplyr)
library(purrr)
library(knitr)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
dataset <- read.csv("zomato_dataset.csv")
str(dataset)
## 'data.frame': 123657 obs. of 12 variables:
## $ Restaurant.Name: chr "Doner King" "Doner King" "Doner King" "Doner King" ...
## $ Dining.Rating : num 3.9 3.9 3.9 3.9 3.9 3.9 3.9 3.9 3.9 3.9 ...
## $ Delivery.Rating: num 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 4.2 ...
## $ Dining.Votes : int 39 39 39 39 39 39 39 39 39 39 ...
## $ Delivery.Votes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Cuisine : chr "Fast Food" "Fast Food" "Fast Food" "Fast Food" ...
## $ Place.Name : chr "Malakpet" "Malakpet" "Malakpet" "Malakpet" ...
## $ City : chr " Hyderabad" " Hyderabad" " Hyderabad" " Hyderabad" ...
## $ Item.Name : chr "Platter Kebab Combo" "Chicken Rumali Shawarma" "Chicken Tandoori Salad" "Chicken BBQ Salad" ...
## $ Best.Seller : chr "BESTSELLER" "BESTSELLER" "" "BESTSELLER" ...
## $ Votes : int 84 45 39 43 31 48 27 59 29 31 ...
## $ Prices : num 249 129 189 189 205 199 165 165 115 129 ...
dataset <- clean_names(dataset)
summary(dataset)
## restaurant_name dining_rating delivery_rating dining_votes
## Length:123657 Min. :2.50 Min. :2.500 Min. : 0.0
## Class :character 1st Qu.:3.60 1st Qu.:3.800 1st Qu.: 0.0
## Mode :character Median :3.90 Median :4.000 Median : 30.0
## Mean :3.82 Mean :3.963 Mean :152.7
## 3rd Qu.:4.10 3rd Qu.:4.100 3rd Qu.:217.0
## Max. :4.80 Max. :4.600 Max. :997.0
## NA's :32236 NA's :1280
## delivery_votes cuisine place_name city
## Min. : 0.0 Length:123657 Length:123657 Length:123657
## 1st Qu.: 0.0 Class :character Class :character Class :character
## Median : 0.0 Mode :character Mode :character Mode :character
## Mean :115.8
## 3rd Qu.: 23.0
## Max. :983.0
##
## item_name best_seller votes prices
## Length:123657 Length:123657 Min. : 0.00 Min. : 0.95
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 130.00
## Mode :character Mode :character Median : 0.00 Median : 208.57
## Mean : 24.67 Mean : 241.38
## 3rd Qu.: 15.00 3rd Qu.: 299.00
## Max. :9750.00 Max. :12024.00
##
mice() to treat NA in
dining_rating and dining_votesimpute_data <- dataset
# Assuming "Dining.Rating" is the column to be imputed
vars_to_impute <- c("dining_rating","delivery_rating")
impute_mice <- mice(impute_data[vars_to_impute], method = "pmm", m = 5)
##
## iter imp variable
## 1 1 dining_rating delivery_rating
## 1 2 dining_rating delivery_rating
## 1 3 dining_rating delivery_rating
## 1 4 dining_rating delivery_rating
## 1 5 dining_rating delivery_rating
## 2 1 dining_rating delivery_rating
## 2 2 dining_rating delivery_rating
## 2 3 dining_rating delivery_rating
## 2 4 dining_rating delivery_rating
## 2 5 dining_rating delivery_rating
## 3 1 dining_rating delivery_rating
## 3 2 dining_rating delivery_rating
## 3 3 dining_rating delivery_rating
## 3 4 dining_rating delivery_rating
## 3 5 dining_rating delivery_rating
## 4 1 dining_rating delivery_rating
## 4 2 dining_rating delivery_rating
## 4 3 dining_rating delivery_rating
## 4 4 dining_rating delivery_rating
## 4 5 dining_rating delivery_rating
## 5 1 dining_rating delivery_rating
## 5 2 dining_rating delivery_rating
## 5 3 dining_rating delivery_rating
## 5 4 dining_rating delivery_rating
## 5 5 dining_rating delivery_rating
imputed_data <- complete(impute_mice)
# Merge imputed data back into the original dataset
dataset <- merge(dataset, imputed_data, by = "row.names", all.x = TRUE)
# Remove the duplicated row.names column
dataset <- dataset[, -1]
summary(dataset)
## restaurant_name dining_rating.x delivery_rating.x dining_votes
## Length:123657 Min. :2.50 Min. :2.500 Min. : 0.0
## Class :character 1st Qu.:3.60 1st Qu.:3.800 1st Qu.: 0.0
## Mode :character Median :3.90 Median :4.000 Median : 30.0
## Mean :3.82 Mean :3.963 Mean :152.7
## 3rd Qu.:4.10 3rd Qu.:4.100 3rd Qu.:217.0
## Max. :4.80 Max. :4.600 Max. :997.0
## NA's :32236 NA's :1280
## delivery_votes cuisine place_name city
## Min. : 0.0 Length:123657 Length:123657 Length:123657
## 1st Qu.: 0.0 Class :character Class :character Class :character
## Median : 0.0 Mode :character Mode :character Mode :character
## Mean :115.8
## 3rd Qu.: 23.0
## Max. :983.0
##
## item_name best_seller votes prices
## Length:123657 Length:123657 Min. : 0.00 Min. : 0.95
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 130.00
## Mode :character Mode :character Median : 0.00 Median : 208.57
## Mean : 24.67 Mean : 241.38
## 3rd Qu.: 15.00 3rd Qu.: 299.00
## Max. :9750.00 Max. :12024.00
##
## dining_rating.y delivery_rating.y
## Min. :2.500 Min. :2.500
## 1st Qu.:3.600 1st Qu.:3.800
## Median :3.900 Median :4.000
## Mean :3.817 Mean :3.964
## 3rd Qu.:4.100 3rd Qu.:4.100
## Max. :4.800 Max. :4.600
##
old_columns <- c("dining_rating.x", "delivery_rating.x")
# Remove old columns
dataset <- dataset[, !(names(dataset) %in% old_columns)]
dataset <- clean_names(dataset)
summary(dataset)
## restaurant_name dining_votes delivery_votes cuisine
## Length:123657 Min. : 0.0 Min. : 0.0 Length:123657
## Class :character 1st Qu.: 0.0 1st Qu.: 0.0 Class :character
## Mode :character Median : 30.0 Median : 0.0 Mode :character
## Mean :152.7 Mean :115.8
## 3rd Qu.:217.0 3rd Qu.: 23.0
## Max. :997.0 Max. :983.0
## place_name city item_name best_seller
## Length:123657 Length:123657 Length:123657 Length:123657
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## votes prices dining_rating_y delivery_rating_y
## Min. : 0.00 Min. : 0.95 Min. :2.500 Min. :2.500
## 1st Qu.: 0.00 1st Qu.: 130.00 1st Qu.:3.600 1st Qu.:3.800
## Median : 0.00 Median : 208.57 Median :3.900 Median :4.000
## Mean : 24.67 Mean : 241.38 Mean :3.817 Mean :3.964
## 3rd Qu.: 15.00 3rd Qu.: 299.00 3rd Qu.:4.100 3rd Qu.:4.100
## Max. :9750.00 Max. :12024.00 Max. :4.800 Max. :4.600
We can see that dining_rating and delivery_rating have no NA values.
Checking if we have any other columns with NA values
colSums(is.na(dataset))
## restaurant_name dining_votes delivery_votes cuisine
## 0 0 0 0
## place_name city item_name best_seller
## 0 0 0 0
## votes prices dining_rating_y delivery_rating_y
## 0 0 0 0
dataset <- dataset %>%
mutate(average_rating = (dining_rating_y + delivery_rating_y) / 2)
# Creating histogram for distribution of dining ratings
gg_dining <- ggplot(dataset, aes(x = dining_rating_y)) +
geom_histogram(binwidth = 0.1, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Distribution of Dining Ratings", x = "Dining Ratings", y = "Frequency") +
theme_minimal()
# Convert ggplot to Plotly
plotly_dining <- ggplotly(gg_dining)
#Displaying the plot
plotly_dining
# Create a ggplot for delivery ratings
gg_delivery <- ggplot(dataset, aes(x = delivery_rating_y)) +
geom_histogram(binwidth = 0.1, fill = "green", color = "black", alpha = 0.7) +
labs(title = "Distribution of Delivery Ratings", x = "Delivery Ratings", y = "Frequency") +
theme_minimal()
# Convert ggplot to Plotly
plotly_delivery <- ggplotly(gg_delivery)
# Displaying plot
plotly_delivery
# Creating histogram for distribution of pricing
gg_prices <- ggplot(dataset, aes(x = prices)) +
geom_histogram(binwidth = 100, fill = "blue", color = "black", alpha = 0.7) +
labs(title = "Distribution of food prices", x = "Price", y = "Frequency") +
theme_minimal()
# Convert ggplot to Plotly
plotly_prices <- ggplotly(gg_prices)
plotly_prices
# Creating scatter plot uing ggplot()
gg_price_avg_rating <- ggplot(dataset, aes(x = average_rating, y = prices)) +
geom_point(aes(color = "blue"), size = 3, alpha = 0.7) + labs(title = "Scatter Plot: Prices vs. Average Ratings",
x = "Average Ratings",
y = "Prices") +
theme_minimal()
gg_price_avg_rating
city_restaurant_counts <- dataset %>%
group_by(city) %>%
summarize(restaurant_count = n())
city_restaurant_counts
## # A tibble: 17 × 2
## city restaurant_count
## <chr> <int>
## 1 " Ahmedabad" 10178
## 2 " Banaswadi" 97
## 3 " Bangalore" 12040
## 4 " Chennai" 13100
## 5 " Goa" 2360
## 6 " Hyderabad" 15613
## 7 " Jaipur" 14438
## 8 " Kochi" 7759
## 9 " Kolkata" 8662
## 10 " Lucknow" 6455
## 11 " Magrath Road" 54
## 12 " Malleshwaram" 43
## 13 " Mumbai" 13535
## 14 " New Delhi" 3490
## 15 " Pune" 8067
## 16 " Raipur" 7700
## 17 " Ulsoor" 66
gsub().# Using gsub() to replace
dataset$city <- gsub("Banaswadi|Magrath Road|Malleshwaram|Ulsoor", "Bangalore", dataset$city)
unique_cities <- unique(dataset$city)
# Print the unique values
print(unique_cities)
## [1] " Hyderabad" " Lucknow" " New Delhi" " Kolkata" " Raipur"
## [6] " Mumbai" " Chennai" " Pune" " Jaipur" " Kochi"
## [11] " Goa" " Bangalore" " Ahmedabad"
city_restaurant_counts <- dataset %>%
group_by(city) %>%
summarize(restaurant_count = n())
city_restaurant_counts
## # A tibble: 13 × 2
## city restaurant_count
## <chr> <int>
## 1 " Ahmedabad" 10178
## 2 " Bangalore" 12300
## 3 " Chennai" 13100
## 4 " Goa" 2360
## 5 " Hyderabad" 15613
## 6 " Jaipur" 14438
## 7 " Kochi" 7759
## 8 " Kolkata" 8662
## 9 " Lucknow" 6455
## 10 " Mumbai" 13535
## 11 " New Delhi" 3490
## 12 " Pune" 8067
## 13 " Raipur" 7700
# Using ggplot() to plot a bar chart
gg_rest_city_counts <- ggplot(city_restaurant_counts, aes(x = city, y = restaurant_count)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black", alpha = 0.7) +
labs(title = "Number of restaurants in each city", x = "City", y = "Number of Restaurants") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for better readability
# Using plotly() to create an interactive chart
plotly_rest_counts <- ggplotly(gg_rest_city_counts)
plotly_rest_counts
top_restaurants <- dataset %>%
group_by(restaurant_name) %>%
summarize(count = n()) %>%
top_n(10, count)
# Print the top 10 restaurants
print(top_restaurants)
## # A tibble: 10 × 2
## restaurant_name count
## <chr> <int>
## 1 Burger King 1352
## 2 Domino's Pizza 1435
## 3 FreshMenu 1097
## 4 KFC 751
## 5 Kanha 934
## 6 La Pino'z Pizza 868
## 7 McDonald's 2059
## 8 Pizza Hut 1077
## 9 Subway 823
## 10 The Momoz Hub 700
# Using ggplot() to plot bar chart
ggplot(top_restaurants, aes(x = reorder(restaurant_name, -count), y = count)) +
geom_bar(stat = "identity", fill = "skyblue", color = "black") +
labs(title = "Top 10 Restaurants by Count",
x = "Restaurant Name",
y = "Total Votes") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
* McDonald’s has the highest number of outlets followed
by Domino’s and Burger King
delhi_locations <- unique(dataset$place_name[grepl("delhi", dataset$city, ignore.case = TRUE)])
# Print the unique locations in Delhi
# Print the unique locations in Delhi
print(delhi_locations)
## [1] "Connaught Place" "Cyber Hub"
## [3] "Lajpat Nagar 2" "Laxmi Nagar"
## [5] "Delhi University-GTB Nagar" "Mukherjee Nagar"
## [7] "Geeta Colony" "Inderlok"
## [9] "Pandav Nagar" "Rajinder Nagar"
## [11] "Karol Bagh" "Greater Kailash 1 (GK1)"
## [13] "Gole Market" "Rajouri Garden"
## [15] "Kamla Nagar" "West Patel Nagar"
delhi_data <- subset(dataset, grepl("delhi", dataset$city, ignore.case = TRUE))
# Find the top 5 locations by restaurant count
top_locations <- delhi_data %>%
group_by(place_name) %>%
summarise(restaurant_count = n()) %>%
arrange(desc(restaurant_count)) %>%
head(5)
# Print the top 5 locations
print(top_locations)
## # A tibble: 5 × 2
## place_name restaurant_count
## <chr> <int>
## 1 Laxmi Nagar 692
## 2 Connaught Place 479
## 3 Inderlok 420
## 4 Delhi University-GTB Nagar 309
## 5 Gole Market 267
# Visualization using ggplot2
library(ggplot2)
# Bar plot for the top 5 locations
ggplot(top_locations, aes(x = reorder(place_name, -restaurant_count), y = restaurant_count)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Top 5 Locations in Delhi by Restaurant Count",
x = "Location",
y = "Restaurant Count") +
theme_minimal()
* Laxmi Nagar has the highest number of restaurants in
New Delhi followed by Connaught place
unique(dataset$cuisine)
## [1] "Fast Food" "Wraps" "Rolls" "Beverages"
## [5] "Desserts" "Awadhi" "Bakery" "Pizza"
## [9] "Mughlai" "Coffee" "American" "Street Food"
## [13] "Shake" "Biryani" "Pasta" "Burger"
## [17] "Chinese" "Momos" "Tea" "Sandwich"
## [21] "North Indian" "Mithai" "Mandi" "Seafood"
## [25] "Hyderabadi" "Salad" "Sichuan" "South Indian"
## [29] "Kebab" "Italian" "Lucknowi" "Continental"
## [33] "Mexican" "Kerala" "BBQ" "Rajasthani"
## [37] "Ice Cream" "Healthy Food" "Tibetan" "Juices"
## [41] "Shawarma" "Maharashtrian" "Thai" "Vietnamese"
## [45] "Gujarati" "Kathiyawadi" "Turkish" "Andhra"
cuisine_mapping <- list(
"Indian" = c("Awadhi", "Biryani", "Rajasthani", "Gujarati", "Kathiyawadi", "Maharashtrian", "Mughlai", "North Indian", "Lucknowi", "South Indian", "Hyderabadi", "Kebab", "Kerala", "Andhra"),
"Asian" = c("Chinese", "Sichuan", "Momos", "Thai", "Vietnamese", "Tibetan"),
"Western" = c("Pizza", "Pasta", "American", "Continental", "Mexican", "BBQ", "Italian"),
"Fast Food" = c("Fast Food", "Burger", "Sandwich", "Street Food", "Shawarma"),
"Desserts" = c("Desserts", "Mithai", "Bakery", "Ice Cream"),
"Beverages" = c("Beverages", "Coffee", "Tea", "Juices"),
"Healthy Food" = c("Healthy Food", "Salad"),
"Other" = c("Mandi", "Seafood", "Turkish")
)
# Function to map specific cuisines to broader categories
map_to_category <- function(cuisine) {
for (category in names(cuisine_mapping)) {
if (cuisine %in% cuisine_mapping[[category]]) {
return(category)
}
}
return("Other")
}
# Apply the mapping function to create a new column 'category'
dataset$category <- sapply(dataset$cuisine, map_to_category)
# Print unique values from the resulting category column
print(unique(dataset$category))
## [1] "Fast Food" "Other" "Beverages" "Desserts" "Indian"
## [6] "Western" "Asian" "Healthy Food"
gg_cuisine <- ggplot(dataset, aes(x = category)) +
geom_bar(fill = "skyblue", color = "black") +
labs(title = "Distribution of Cuisines",
x = "Cuisine",
y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Convert ggplot to interactive plot using plot_ly
interactive_cuisine <- ggplotly(gg_cuisine)
# Show the interactive plot
interactive_cuisine
summary_table <- dataset %>%
summarise(across(where(is.numeric),
list(
Mean = ~ mean(., na.rm = TRUE),
SD = ~ sd(., na.rm = TRUE),
Min = ~ min(., na.rm = TRUE),
Max = ~ max(., na.rm = TRUE),
Total = ~ sum(!is.na(.))
)))
# Transpose the summary table
summary_table_transposed <- t(summary_table)
# Format the table to display numbers without scientific notation
formatted_summary_table <- format(summary_table_transposed, scientific = FALSE)
# Create a kable table
kable(formatted_summary_table,
caption = "Descriptive Statistics for dataset",
format = "html") %>%
kable_styling(full_width = FALSE)
| dining_votes_Mean | 152.7298576 |
| dining_votes_SD | 232.2140614 |
| dining_votes_Min | 0.0000000 |
| dining_votes_Max | 997.0000000 |
| dining_votes_Total | 123657.0000000 |
| delivery_votes_Mean | 115.7637255 |
| delivery_votes_SD | 243.9708277 |
| delivery_votes_Min | 0.0000000 |
| delivery_votes_Max | 983.0000000 |
| delivery_votes_Total | 123657.0000000 |
| votes_Mean | 24.6667718 |
| votes_SD | 125.2360091 |
| votes_Min | 0.0000000 |
| votes_Max | 9750.0000000 |
| votes_Total | 123657.0000000 |
| prices_Mean | 241.3783986 |
| prices_SD | 192.8307128 |
| prices_Min | 0.9500000 |
| prices_Max | 12024.0000000 |
| prices_Total | 123657.0000000 |
| dining_rating_y_Mean | 3.8174766 |
| dining_rating_y_SD | 0.4110176 |
| dining_rating_y_Min | 2.5000000 |
| dining_rating_y_Max | 4.8000000 |
| dining_rating_y_Total | 123657.0000000 |
| delivery_rating_y_Mean | 3.9635354 |
| delivery_rating_y_SD | 0.2458464 |
| delivery_rating_y_Min | 2.5000000 |
| delivery_rating_y_Max | 4.6000000 |
| delivery_rating_y_Total | 123657.0000000 |
| average_rating_Mean | 3.8905060 |
| average_rating_SD | 0.2729674 |
| average_rating_Min | 2.7500000 |
| average_rating_Max | 4.6500000 |
| average_rating_Total | 123657.0000000 |
2.Data Cleaning:
3.Visualizations:
4.Insights:
5.City-Specific Analysis:
6.Cuisine Analysis:
7.Descriptive Statistics: